{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.probability import FreqDist\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "\n",
    "# Shared tokenizer: each match of the pattern (a run of word characters)\n",
    "# becomes one token.\n",
    "tokenizer = RegexpTokenizer(pattern=r'\\w+')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 205147 entries, 0 to 205146\n",
      "Data columns (total 3 columns):\n",
      "title           205146 non-null object\n",
      "total_shares    205147 non-null int64\n",
      "url             205147 non-null object\n",
      "dtypes: int64(1), object(2)\n",
      "memory usage: 4.7+ MB\n"
     ]
    }
   ],
   "source": [
    "# Load the CSV and keep only the three columns selected for this notebook.\n",
    "data = pd.read_csv(\"alltopcontent.csv\")[[\"title\", \"total_shares\", \"url\"]]\n",
    "\n",
    "# DataFrame.info() writes its summary to stdout and returns None, so it is\n",
    "# called directly; wrapping it in print() emitted a stray \"None\" line.\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flat list of every lower-cased word token found in the titles.\n",
    "allwords = []\n",
    "# Walk the column itself instead of a hard-coded row count, so every row is\n",
    "# covered whatever the file size (range(0, 205146) skipped the final row).\n",
    "# Missing titles are dropped first; str(NaN) would otherwise add a bogus\n",
    "# 'nan' token to the word list.\n",
    "for title in data['title'].dropna():\n",
    "    allwords.extend(tokenizer.tokenize(str(title).lower()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['which', 'birth', 'dates', 'are', 'most', 'common', 'legacy', 'of', 'discord', 'disturbed', 'the', 'sound', 'of', 'silence', 'official', 'music', 'video', 'ed', 'sheeran', 'thinking', 'out', 'loud', 'official', 'video', 'ed', 'sheeran', 'shape', 'of', 'you', 'official', 'video', 'newcastle', 'school', 'boy', 'aged', '12', 'has', 'been', 'found', 'safe', 'and', 'well', 'police', 'hour', 'the', 'secret', 'society', 'hidden', 'mystery', 'wiz', 'khalifa', 'see', 'you', 'again', 'ft', 'charlie', 'puth', 'official', 'video', 'furious', '7', 'soundtrack', 'coldplay', 'hymn', 'for', 'the', 'weekend', 'official', 'video', 'no', 'ads', 'found', '12', '14', 'year', 'olds', 'found', 'safe', 'and', 'well', 'after', 'extensive', 'police', 'search', 'police', 'hour', 'linkin', 'park', 'singer', 'chester', 'bennington', 'dead', 'commits', 'suicide', 'by', 'hanging', 'loyal', 'employees', 'are', 'your']\n"
     ]
    }
   ],
   "source": [
    "# Spot-check the tokenizer output: show the first 100 collected tokens.\n",
    "print(allwords[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
